### ---- SCRIPT 06: The purpose of this script is to summarise the guest appearances on each programme. This includes --- ####
### ---- the proportion of guests on each programme by their category type, organisation, and organisation leaning. --- ####

# Apply a large penalty to scientific notation so numbers print in fixed notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data wrangling

#### ---- LOAD DATA ---- ####

# Twitter user data
# NOTE(review): `user_data` is not referenced again in the visible part of this script.
user_data <- fread("user_ideal_point_data_updated.csv")

# Guest master list (joined onto the guest appearances by name further down)
guest_data <- fread("username_master_list_with_ideal_points.csv")

# TV guest lists: read every file found in guest_lists/ into a list of tables,
# one element per file
guest_lists <- lapply(
  list.files("guest_lists/", full.names = TRUE),
  fread
)

#### ---- MERGE THE GUEST LISTS TOGETHER AND MATCH TO THEIR IDEAL POINTS ---- ####

# Stack the individual guest lists into one dataset (one row per guest
# appearance), then attach each guest's master-list record by name. This brings
# in the guest's ideal point together with the ideal point of their organisation
# and organisation affiliates; the latter are only used where an individual does
# not have an ideal point of their own.
# NOTE(review): the join presumably expects `name` to be unique in `guest_data`;
# duplicated names would duplicate appearance rows and inflate the counts below.
guest_lists_merged <- guest_lists %>%
  rbindlist() %>%
  left_join(guest_data, by = c("Name" = "name"))

#### ---- SUMMARISE GUEST SELECTIONS ON EACH SHOW BY CATEGORY TYPE AND ORGANISATION ---- ####

# Overall number of guest appearances per category type, plus each category's
# share of all appearances (the summary is ungrouped, so the shares sum to 100)
guest_catergories_summary <- guest_lists_merged %>%
  group_by(category_type) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)

# Per-show breakdown: number of appearances for every Show x category type pair.
# summarise() peels off the last grouping level, so the result is still grouped
# by Show and the percentage below is each category's share *within* its show.
tv_show_guest_categories <- guest_lists_merged %>%
  group_by(Show, category_type) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)

# Order shows by proportion of political guests (ascending) for benefit of plot
# interpretability, as this is the biggest group
shows_plot_ordering <- tv_show_guest_categories %>%
  filter(category_type == "Political") %>%
  arrange(percentage) %>%
  pull(Show)

# A show with no "Political" rows is missing from the vector above. Left as is,
# factor() below would turn that show into NA and it would be mislabelled in the
# plot. Treat such shows as having 0% political guests and put them first, which
# is consistent with the ascending order used for the other shows.
shows_without_political <- setdiff(
  unique(tv_show_guest_categories$Show),
  shows_plot_ordering
)
shows_plot_ordering <- c(shows_without_political, shows_plot_ordering)

tv_show_guest_categories$Show <- factor(
  tv_show_guest_categories$Show,
  levels = shows_plot_ordering
)

# Category levels run from the least to the most frequent category overall
tv_show_guest_categories$category_type <- factor(
  tv_show_guest_categories$category_type,
  levels = guest_catergories_summary$category_type[
    order(guest_catergories_summary$count, decreasing = FALSE)
  ]
)

# Stacked bar chart: one horizontal bar per show, split by guest category type
tv_guest_categories_plot <- ggplot(
  tv_show_guest_categories,
  aes(x = Show, y = percentage, fill = category_type)
) +
  # the percentages are already computed, so draw the values as given
  geom_col(position = "stack", width = 0.5, alpha = 0.8) +
  labs(title = "", x = "", y = "Percentage", fill = "") +
  scale_fill_brewer(palette = "Set1") +
  # flip so that the show names are listed down the vertical axis
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    legend.position = "right"
  )

# Write the chart to disk
ggsave(
  filename = "guest_category_type_by_show.png",
  plot = tv_guest_categories_plot,
  width = 7, height = 4, units = "in", dpi = 300,
  bg = "white"
)

# Summarise T.V show guest appearances by proportion of guests from each main category type *and* political leaning

# First tidy the leaning labels: fix a capitalisation typo and shorten two of the
# labels. Old label -> new label, applied one after another in this order.
# NOTE(review): missing (NA) leanings are left as NA here, not mapped to "Unk.".
leaning_relabels <- c(
  "RIght" = "Right",
  "Centre/Neutral/Independent" = "Centre/Neutral",
  "N/A" = "Unk."
)
for (old_label in names(leaning_relabels)) {
  is_old_label <- guest_lists_merged$organisation_leaning == old_label
  guest_lists_merged$organisation_leaning[is_old_label] <- leaning_relabels[[old_label]]
}

# Inspect the labels that remain after tidying
unique(guest_lists_merged$organisation_leaning)

# Count appearances for every Show x category type x organisation leaning
# combination, then re-group by Show alone so the percentage (and its running
# total) is computed within each show. `org_lean_merged` is the combined
# "<category type> <leaning>" label used on the heatmap axis.
tv_show_guest_leaning <- guest_lists_merged %>%
  group_by(Show, category_type, organisation_leaning) %>%
  summarise(count = n()) %>%
  arrange(Show) %>%
  group_by(Show) %>%
  mutate(
    percentage = count / sum(count) * 100,
    cum_perc = cumsum(percentage),
    org_lean_merged = paste(category_type, organisation_leaning)
  )

# Keep only the columns needed to draw the heatmap
tv_show_guest_leaning_coords <- select(
  tv_show_guest_leaning,
  Show, org_lean_merged, percentage
)

# Order the combined categories by most occurring (summed over all shows)
guest_categories_ordered <- tv_show_guest_leaning %>%
  group_by(org_lean_merged) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  pull(org_lean_merged)

# Levels are reversed so that the most frequent category is the last level
tv_show_guest_leaning_coords$org_lean_merged <- factor(
  tv_show_guest_leaning_coords$org_lean_merged,
  levels = rev(guest_categories_ordered)
)

# Rank the shows by the median ideal point of their guests (highest first).
# Guests without an ideal point are ignored when taking the median.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
shows_ideal_point_plot_ordering <- guest_lists_merged %>%
  group_by(Show) %>%
  summarise(median_ideal_point = median(user_ideal_point, na.rm = TRUE)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(Show)

# Use the reversed ranking as the show order on the heatmap axis
tv_show_guest_leaning_coords$Show <- factor(
  tv_show_guest_leaning_coords$Show,
  levels = rev(shows_ideal_point_plot_ordering)
)

# Heatmap: shows along the x axis, category/leaning labels along the y axis,
# tiles shaded by the share of the show's guest appearances
tv_guests_heatmap <- ggplot(
  tv_show_guest_leaning_coords,
  aes(x = Show, y = org_lean_merged, fill = percentage)
) +
  geom_tile() +
  labs(
    x = "",
    y = "",
    fill = "Percentage",
    title = "Guest Category and Organisation Political Leaning"
  ) +
  theme_minimal() +
  theme(
    # tilt the show names so they do not overlap
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
    axis.title = element_blank(),
    panel.grid = element_blank()
  ) +
  # white -> grey -> black, with grey placed at 10% of the fill range
  scale_fill_gradientn(
    colours = c("white", "grey", "black"),
    values = c(0, 0.1, 1)
  )

# Write the heatmap to disk
ggsave(
  filename = "guests_leaning_by_show_heatmap.png",
  plot = tv_guests_heatmap,
  width = 7, height = 7, units = "in", dpi = 300,
  bg = "white"
)

# By Organisation

# Guests recorded with organisation "N/A" are relabelled as "Unaffiliated"
guest_lists_merged$organisation[guest_lists_merged$organisation == "N/A"] <- "Unaffiliated"

# Take the 30 organisations with the most guest appearances ...
organisations_top30 <- guest_lists_merged %>%
  group_by(organisation) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 30)

# ... and fold all other organisations into one category ("Other")
guest_lists_merged$organisation[
  !(guest_lists_merged$organisation %in% organisations_top30$organisation)
] <- "Other"

# Count appearances for every Show x organisation pair, then compute each
# organisation's share (and the running total of shares) within its show
tv_show_organisation <- guest_lists_merged %>%
  group_by(Show, organisation) %>%
  summarise(count = n()) %>%
  arrange(Show) %>%
  group_by(Show) %>%
  mutate(
    percentage = count / sum(count) * 100,
    cum_perc = cumsum(percentage)
  )

# Keep only the columns needed to draw the heatmap
tv_show_organisation_coords <- select(
  tv_show_organisation,
  Show, organisation, percentage
)

# Top-30 organisations in descending order of appearances, followed by "Other"
organisations_ordered <- c(organisations_top30$organisation, "Other")

# Levels are reversed so that "Other" is the first level and the most frequent
# organisation the last
tv_show_organisation_coords$organisation <- factor(
  tv_show_organisation_coords$organisation,
  levels = rev(organisations_ordered)
)

# Fixed, hand-picked show order for the heatmap axis.
# NOTE(review): any Show value not spelled exactly as below becomes NA here;
# confirm this list covers every show in the guest lists.
tv_show_organisation_coords$Show <- factor(
  tv_show_organisation_coords$Show,
  levels = c(
    "BBC One: QT", "BBC One: SwLK", "BBC Two: PL",
    "ITV: Peston", "Channel 4: ANS", "Sky: SRoS",
    "GB News: CTS"
  )
)

# Heatmap: shows along the x axis, organisations along the y axis, tiles shaded
# by the share of the show's guest appearances
tv_orgs_heatmap <- ggplot(
  tv_show_organisation_coords,
  aes(x = Show, y = organisation, fill = percentage)
) +
  geom_tile() +
  labs(
    x = "",
    y = "",
    fill = "Percentage",
    title = "Guest Organisation Represented"
  ) +
  theme_minimal() +
  theme(
    # tilt the show names so they do not overlap
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
    axis.title = element_blank(),
    panel.grid = element_blank()
  ) +
  # white -> grey -> black, with grey placed at 10% of the fill range
  scale_fill_gradientn(
    colours = c("white", "grey", "black"),
    values = c(0, 0.1, 1)
  )

# Write the heatmap to disk
ggsave(
  filename = "guests_organisation_heatmap.png",
  plot = tv_orgs_heatmap,
  width = 7, height = 7, units = "in", dpi = 300,
  bg = "white"
)

#### ---- END ---- ####
